In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import statsmodels.api as sm
from statsmodels.formula.api import ols

Load the data

In [2]:
crime = pd.read_csv('formatted.csv', sep='\s*,\s*',encoding='latin-1',engine='python', na_values=["?"])
In [3]:
crime.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2218 entries, 0 to 2217
Columns: 151 entries, Unnamed: 0 to Division
dtypes: float64(145), int64(1), object(5)
memory usage: 2.6+ MB
In [4]:
crime.head()
Out[4]:
Unnamed: 0 communityName statecode countyCode communityCode fold population householdsize racepctblack racePctWhite ... larcPerPop autoTheft autoTheftPerPop arsons arsonsPerPop ViolentCrimesPerPop nonViolPerPop State Region Division
0 0 BerkeleyHeightstownship NJ 39.0 5320.0 1.0 11980.0 3.10 1.37 91.78 ... 1132.08 16.0 131.26 2.0 16.41 41.02 1394.59 New Jersey Northeast Middle Atlantic
1 1 Bricktownship NJ 29.0 7420.0 1.0 66473.0 2.66 0.63 97.81 ... 1773.32 98.0 143.15 14.0 20.45 131.47 2543.13 New Jersey Northeast Middle Atlantic
2 2 ScotchPlainstownship NJ 39.0 66060.0 1.0 21160.0 2.79 11.10 83.79 ... 1271.19 89.0 409.91 2.0 9.21 115.14 2160.10 New Jersey Northeast Middle Atlantic
3 3 Gallowaytownship NJ 1.0 25560.0 1.0 23330.0 2.94 7.36 88.58 ... 2332.30 46.0 192.61 7.0 29.31 251.24 3668.03 New Jersey Northeast Middle Atlantic
4 4 NewProvidenceborough NJ 39.0 51810.0 1.0 11439.0 2.70 0.54 94.18 ... 768.21 18.0 148.69 2.0 16.52 49.56 1032.55 New Jersey Northeast Middle Atlantic

5 rows × 151 columns

In [5]:
crime.columns
Out[5]:
Index(['Unnamed: 0', 'communityName', 'statecode', 'countyCode',
       'communityCode', 'fold', 'population', 'householdsize', 'racepctblack',
       'racePctWhite',
       ...
       'larcPerPop', 'autoTheft', 'autoTheftPerPop', 'arsons', 'arsonsPerPop',
       'ViolentCrimesPerPop', 'nonViolPerPop', 'State', 'Region', 'Division'],
      dtype='object', length=151)

Explore the data: Histograms

Check out min, mx, mean, median, and std for non-violent crimes

In [6]:
print('Minimum for violent crimes: ', crime['ViolentCrimesPerPop'].min())
print('Maximum for violent crimes: ', crime['ViolentCrimesPerPop'].max())
print('Mean for violent crimes: ', crime['ViolentCrimesPerPop'].mean())
print('STD for violent crimes: ', crime['ViolentCrimesPerPop'].std())
print('Median for violent crimes: ', crime['ViolentCrimesPerPop'].median())
Minimum for violent crimes:  0.0
Maximum for violent crimes:  4877.06
Mean for violent crimes:  589.0789217652957
STD for violent crimes:  614.7845182453359
Median for violent crimes:  374.06

Check out min, mx, mean, median, and std for non-violent crimes

In [7]:
print('Minimum for non-violent crimes: ', crime['nonViolPerPop'].min())
print('Maximum for non-violent crimes: ', crime['nonViolPerPop'].max())
print('Mean for non-violent crimes: ', crime['nonViolPerPop'].mean())
print('STD for non-violent crimes: ', crime['nonViolPerPop'].std())
print('Median for non-violent crimes: ', crime['nonViolPerPop'].median())
Minimum for non-violent crimes:  116.79
Maximum for non-violent crimes:  27119.76
Mean for non-violent crimes:  4908.241803588295
STD for non-violent crimes:  2739.7089005280213
Median for non-violent crimes:  4425.450000000001

Violent crime density

In [8]:
plt.figure(figsize=(20,15))

plt.subplot(3,2,1)
plt.title('Violent Crimes density')
crime['ViolentCrimesPerPop'].hist()
plt.xlabel('Number of Violent Crimes per 100k population')
plt.ylabel('count')

plt.subplot(3,2,2)
plt.title('Murder Crimes density')
crime['murdPerPop'].hist()
plt.xlabel('Number of Murders per 100k population')
plt.ylabel('count')

plt.subplot(3,2,3)
plt.title('Rape Crimes density')
crime['rapesPerPop'].hist()
plt.xlabel('Number of Rapes per 100k population')
plt.ylabel('count')

plt.subplot(3,2,4)

plt.title('Robbery Crimes density')
crime['robbbPerPop'].hist()
plt.xlabel('Number of Robberies per 100K population')
plt.ylabel('count')

plt.subplot(3,2,5)
plt.title('Assault Crimes density')
crime['assaultPerPop'].hist()
plt.xlabel('Number of Assaults per 100k population')
plt.ylabel('count')
Out[8]:
Text(0, 0.5, 'count')

Non-violent crime density

In [9]:
plt.figure(figsize=(20,15))

plt.subplot(3,2,1)
plt.title('Non-Violent Crimes density')
crime['nonViolPerPop'].hist()
plt.xlabel('Number of Non-Violent Crimes per 100k population')
plt.ylabel('count')

plt.subplot(3,2,2)
plt.title('Arson Crimes density')
crime['arsonsPerPop'].hist()
plt.xlabel('Arsons density')
plt.ylabel('count')

plt.subplot(3,2,3)
plt.title('Burglary Crimes density')
crime['burglPerPop'].hist()
plt.xlabel('Burglaries density')
plt.ylabel('count')

plt.subplot(3,2,4)
plt.title('Larceny Crimes density')
crime['larcPerPop'].hist()
plt.xlabel('Larcenies density')
plt.ylabel('Counts')

plt.subplot(3,2,5)
plt.title('Auto-theft Crimes density')
crime['autoTheftPerPop'].hist()
plt.xlabel('Auto thefts density')
plt.ylabel('Counts')
Out[9]:
Text(0, 0.5, 'Counts')

This is a smart way of visualizing density for each type of crime and see which one is most widespread

Check per capita income for each race

In [10]:
plt.figure(figsize=(20,15))

plt.subplot(3,2,1)
plt.title('whitePerCap')
crime['whitePerCap'].hist()
plt.xlabel('Per capita income for caucasians')
plt.ylabel('count')

plt.subplot(3,2,2)
plt.title('blackPerCap')
crime['blackPerCap'].hist()
plt.xlabel('Per capita income for blacks')
plt.ylabel('count')

plt.subplot(3,2,3)
plt.title('indianPerCap')
crime['indianPerCap'].hist()
plt.xlabel('Per capita income for indians')
plt.ylabel('count')

plt.subplot(3,2,4)
plt.title('AsianPerCap')
crime['AsianPerCap'].hist()
plt.xlabel('Per capita income for asians')
plt.ylabel('count')

plt.subplot(3,2,5)
plt.title('HispPerCap')
crime['HispPerCap'].hist()
plt.xlabel('Per capita income for hispanics')
plt.ylabel('count')
Out[10]:
Text(0, 0.5, 'count')

Exploring data through linear regression plots

Plot linear regression plots for %age and non-violent crimes

In [11]:
sns.lmplot(x='agePct12t21', y='nonViolPerPop', data=crime)
sns.lmplot(x='agePct12t29', y='nonViolPerPop', data=crime)
sns.lmplot(x='agePct16t24', y='nonViolPerPop', data=crime)
sns.lmplot(x='agePct65up', y='nonViolPerPop', data=crime)
Out[11]:
<seaborn.axisgrid.FacetGrid at 0x25c5876c340>
In [14]:
# calculate correlation coeffcient between %age and non-violent crimes
crime1 = crime.dropna(subset=['agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', \
                              'agePop12t21', 'agePop12t29', 'agePop16t24', 'agePop65up',
                              'nonViolPerPop', 'ViolentCrimesPerPop'], how='any')
print('Correlation coefficient for ages 12-21: ', scipy.stats.pearsonr(crime1['agePct12t21'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for ages 12-29: ', scipy.stats.pearsonr(crime1['agePct12t29'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for ages 16-24: ', scipy.stats.pearsonr(crime1['agePct16t24'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for ages 65+: ', scipy.stats.pearsonr(crime1['agePct65up'], crime1['nonViolPerPop'])[0])
Correlation coefficient for ages 12-21:  0.023780025891635352
Correlation coefficient for ages 12-29:  0.11128019835681054
Correlation coefficient for ages 16-24:  0.06647804421706746
Correlation coefficient for ages 65+:  0.126582344754042

Plot linear regression plots for population age and non-violent crimes

In [13]:
crime['agePop12t21'] = crime['agePct12t21'] * crime['population']
crime['agePop12t29'] = crime['agePct12t29'] * crime['population']
crime['agePop16t24'] = crime['agePct16t24'] * crime['population']
crime['agePop65up'] = crime['agePct65up'] * crime['population']
In [15]:
sns.lmplot(x='agePop12t21', y='nonViolPerPop', data=crime)
sns.lmplot(x='agePop12t29', y='nonViolPerPop', data=crime)
sns.lmplot(x='agePop16t24', y='nonViolPerPop', data=crime)
sns.lmplot(x='agePop65up', y='nonViolPerPop', data=crime)
Out[15]:
<seaborn.axisgrid.FacetGrid at 0x25c594c71c0>
In [16]:
# calculate correlation coefficient between population age and non-violent crimes
print('Correlation coefficient for ages 12-21: ', scipy.stats.pearsonr(crime1['agePop12t21'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for ages 12-29: ', scipy.stats.pearsonr(crime1['agePop12t29'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for ages 16-24: ', scipy.stats.pearsonr(crime1['agePop16t24'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for ages 65+: ', scipy.stats.pearsonr(crime1['agePop65up'], crime1['nonViolPerPop'])[0])
Correlation coefficient for ages 12-21:  0.12683367590715233
Correlation coefficient for ages 12-29:  0.1252028812333492
Correlation coefficient for ages 16-24:  0.13008854745491244
Correlation coefficient for ages 65+:  0.12455724077253921

Plot linear regression for % age and violent crimes

In [18]:
sns.lmplot(x='agePct12t21', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='agePct12t29', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='agePct16t24', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='agePct65up', y='ViolentCrimesPerPop', data=crime)
Out[18]:
<seaborn.axisgrid.FacetGrid at 0x25c5b2e63a0>
In [20]:
# calculate correlation coeffcient between %age and violent crimes
print('Correlation coefficient for ages 12-21: ', scipy.stats.pearsonr(crime1['agePct12t21'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for ages 12-29: ', scipy.stats.pearsonr(crime1['agePct12t29'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for ages 16-24: ', scipy.stats.pearsonr(crime1['agePct16t24'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for ages 65+: ', scipy.stats.pearsonr(crime1['agePct65up'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for ages 12-21:  0.02202395356565119
Correlation coefficient for ages 12-29:  0.10993960791346309
Correlation coefficient for ages 16-24:  0.04841759061410261
Correlation coefficient for ages 65+:  0.05396574839211375

Plot linear regression plots for population age and violent crimes

In [21]:
sns.lmplot(x='agePop12t21', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='agePop12t29', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='agePop16t24', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='agePop65up', y='ViolentCrimesPerPop', data=crime)
Out[21]:
<seaborn.axisgrid.FacetGrid at 0x25c5b41c040>
In [22]:
# calculate correlation coeffcient between %age and violent crimes
print('Correlation coefficient for ages 12-21: ', scipy.stats.pearsonr(crime1['agePop12t21'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for ages 12-29: ', scipy.stats.pearsonr(crime1['agePop12t29'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for ages 16-24: ', scipy.stats.pearsonr(crime1['agePop16t24'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for ages 65+: ', scipy.stats.pearsonr(crime1['agePop65up'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for ages 12-21:  0.2201235905280936
Correlation coefficient for ages 12-29:  0.2188657592482586
Correlation coefficient for ages 16-24:  0.2226443146601304
Correlation coefficient for ages 65+:  0.2117114786134144

Plot linear regression plots for %education and non-violent crimes

In [23]:
sns.lmplot(x='PctLess9thGrade', y='nonViolPerPop', data=crime)
sns.lmplot(x='PctNotHSGrad', y='nonViolPerPop', data=crime)
sns.lmplot(x='PctBSorMore', y='nonViolPerPop', data=crime)
Out[23]:
<seaborn.axisgrid.FacetGrid at 0x25c5b3f6400>
In [27]:
# calculate correlation coeffcient between %education and non-violent crimes
crime1 = crime.dropna(subset=['agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 
                              'agePop12t21', 'agePop12t29', 'agePop16t24', 'agePop65up',
                              'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
                              'PopLess9thGrade', 'PopNotHSGrad', 'PopBSorMore',
                              'nonViolPerPop', 'ViolentCrimesPerPop'], how='any')
print('Correlation coefficient for PctLess9thGrade: ', scipy.stats.pearsonr(crime1['PctLess9thGrade'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PctNotHSGrad: ', scipy.stats.pearsonr(crime1['PctNotHSGrad'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PctBSorMore: ', scipy.stats.pearsonr(crime1['PctBSorMore'], crime1['nonViolPerPop'])[0])
Correlation coefficient for PctLess9thGrade:  0.28784927687473005
Correlation coefficient for PctNotHSGrad:  0.36650015753649645
Correlation coefficient for PctBSorMore:  -0.27101682578840325

Plot linear regression plots for population education and non-violent crimes

In [28]:
crime['PopLess9thGrade'] = crime['PctLess9thGrade'] * crime['population']
crime['PopNotHSGrad'] = crime['PctNotHSGrad'] * crime['population']
crime['PopBSorMore'] = crime['PctBSorMore'] * crime['population']
In [29]:
sns.lmplot(x='PopLess9thGrade', y='nonViolPerPop', data=crime)
sns.lmplot(x='PopNotHSGrad', y='nonViolPerPop', data=crime)
sns.lmplot(x='PopBSorMore', y='nonViolPerPop', data=crime)
Out[29]:
<seaborn.axisgrid.FacetGrid at 0x25c5956c2e0>
In [30]:
# calculate correlation coeffcient between population education and non-violent crimes
print('Correlation coefficient for PopLess9thGrade: ', scipy.stats.pearsonr(crime1['PopLess9thGrade'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PopNotHSGrad: ', scipy.stats.pearsonr(crime1['PopNotHSGrad'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PopBSorMore: ', scipy.stats.pearsonr(crime1['PopBSorMore'], crime1['nonViolPerPop'])[0])
Correlation coefficient for PopLess9thGrade:  0.11147481298678569
Correlation coefficient for PopNotHSGrad:  0.11912455062613178
Correlation coefficient for PopBSorMore:  0.09851144679694109

Plot linear regression plots for %education and non-violent crimes

In [31]:
sns.lmplot(x='PctLess9thGrade', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PctNotHSGrad', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PctBSorMore', y='ViolentCrimesPerPop', data=crime)
Out[31]:
<seaborn.axisgrid.FacetGrid at 0x25c58c9c280>
In [32]:
# calculate correlation coeffcient between %education and violent crimes
print('Correlation coefficient for PctLess9thGrade: ', scipy.stats.pearsonr(crime1['PctLess9thGrade'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PctNotHSGrad: ', scipy.stats.pearsonr(crime1['PctNotHSGrad'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PctBSorMore: ', scipy.stats.pearsonr(crime1['PctBSorMore'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for PctLess9thGrade:  0.37080716309505024
Correlation coefficient for PctNotHSGrad:  0.46651461611308775
Correlation coefficient for PctBSorMore:  -0.2992900545785156

Plot linear regression plots for population education and violent crimes

In [33]:
sns.lmplot(x='PopLess9thGrade', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PopNotHSGrad', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PopBSorMore', y='ViolentCrimesPerPop', data=crime)
Out[33]:
<seaborn.axisgrid.FacetGrid at 0x25c58b34c10>
In [34]:
# calculate correlation coeffcient between population education and violent crimes
print('Correlation coefficient for PopLess9thGrade: ', scipy.stats.pearsonr(crime1['PopLess9thGrade'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PopNotHSGrad: ', scipy.stats.pearsonr(crime1['PopNotHSGrad'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PopBSorMore: ', scipy.stats.pearsonr(crime1['PopBSorMore'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for PopLess9thGrade:  0.21514133095190577
Correlation coefficient for PopNotHSGrad:  0.2247424530842106
Correlation coefficient for PopBSorMore:  0.17883148768814133

Plot linear regression plots for %employment/unemployment and non-violent crimes

In [35]:
sns.lmplot(x='PctEmploy', y='nonViolPerPop', data=crime)
sns.lmplot(x='PctUnemployed', y='nonViolPerPop', data=crime)
Out[35]:
<seaborn.axisgrid.FacetGrid at 0x25c58b348e0>
In [40]:
# calculate correlation coefficient between %employed/unemployed and non-violent crimes
crime1 = crime.dropna(subset=['agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 
                              'agePop12t21', 'agePop12t29', 'agePop16t24', 'agePop65up',
                              'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
                              'PopLess9thGrade', 'PopNotHSGrad', 'PopBSorMore',
                              'PctUnemployed', 'PctEmploy',
                              'PopUnemployed', 'PopEmploy',
                              'nonViolPerPop', 'ViolentCrimesPerPop'], how='any')
print('Correlation coefficient for PctUnemployed: ', scipy.stats.pearsonr(crime1['PctUnemployed'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PctEmploy: ', scipy.stats.pearsonr(crime1['PctEmploy'], crime1['nonViolPerPop'])[0])
Correlation coefficient for PctUnemployed:  0.3920850019155378
Correlation coefficient for PctEmploy:  -0.30471049193594246

Plot linear regression plots for population employment/unemploymet and non-violent crimes

In [41]:
crime['PopEmploy'] = crime['PctEmploy'] * crime['population']
crime['PopUnemployed'] = crime['PctUnemployed'] * crime['population']
In [42]:
sns.lmplot(x='PopEmploy', y='nonViolPerPop', data=crime)
sns.lmplot(x='PopUnemployed', y='nonViolPerPop', data=crime)
Out[42]:
<seaborn.axisgrid.FacetGrid at 0x25c5b31cac0>
In [43]:
# Calculate correlation coefficient for population employed/unemployed and non-violent crimes
print('Correlation coefficient for PopUnemployed: ', scipy.stats.pearsonr(crime1['PopUnemployed'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PopEmploy: ', scipy.stats.pearsonr(crime1['PopEmploy'], crime1['nonViolPerPop'])[0])
Correlation coefficient for PopUnemployed:  0.1217417798243135
Correlation coefficient for PopEmploy:  0.11749062427354048

Plot linear regression plots for %employment/unemployment and violent crimes

In [44]:
sns.lmplot(x='PctEmploy', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PctUnemployed', y='ViolentCrimesPerPop', data=crime)
Out[44]:
<seaborn.axisgrid.FacetGrid at 0x25c5be6c400>
In [45]:
# Calculate correlation coefficient for %employed/unemployed and violent crimes
print('Correlation coefficient for PctUnemployed: ', scipy.stats.pearsonr(crime1['PctUnemployed'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PctEmploy: ', scipy.stats.pearsonr(crime1['PctEmploy'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for PctUnemployed:  0.4749680398078534
Correlation coefficient for PctEmploy:  -0.31226118672258435

Plot linear regression plots for population employment/unemploymet and violent crimes

In [46]:
sns.lmplot(x='PopEmploy', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PopUnemployed', y='ViolentCrimesPerPop', data=crime)
Out[46]:
<seaborn.axisgrid.FacetGrid at 0x25c5bb32970>
In [47]:
# Calculate correlation coefficient for population employed/unemployed and violent crimes
print('Correlation coefficient for PopUnemployed: ', scipy.stats.pearsonr(crime1['PopUnemployed'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PopEmploy: ', scipy.stats.pearsonr(crime1['PopEmploy'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for PopUnemployed:  0.22277532593518973
Correlation coefficient for PopEmploy:  0.20954788198015378

Plot linear regression plots for %vacancy and non-violent crimes

In [48]:
sns.lmplot(x='PctHousOccup', y='nonViolPerPop', data=crime)
sns.lmplot(x='PctHousOwnOcc', y='nonViolPerPop', data=crime)
sns.lmplot(x='PctVacantBoarded', y='nonViolPerPop', data=crime)
sns.lmplot(x='PctVacMore6Mos', y='nonViolPerPop', data=crime)
Out[48]:
<seaborn.axisgrid.FacetGrid at 0x25c5bb940a0>
In [81]:
# calculate correlation coefficient between %vacancy and non-violent crimes
crime1 = crime.dropna(subset=['agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 
                              'agePop12t21', 'agePop12t29', 'agePop16t24', 'agePop65up',
                              'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
                              'PopLess9thGrade', 'PopNotHSGrad', 'PopBSorMore',
                              'PctUnemployed', 'PctEmploy',
                              'PopUnemployed', 'PopEmploy',
                              'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
                              'PopHousOccup', 'PopHousOwnOcc', 'PopVacantBoarded', 'PopVacMore6Mos',
                              'nonViolPerPop', 'ViolentCrimesPerPop'], how='any')
print('Correlation coefficient for PctHousOccup: ', scipy.stats.pearsonr(crime1['PctHousOccup'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PctHousOwnOcc: ', scipy.stats.pearsonr(crime1['PctHousOwnOcc'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PctVacantBoarded: ', scipy.stats.pearsonr(crime1['PctVacantBoarded'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PctVacMore6Mos: ', scipy.stats.pearsonr(crime1['PctVacMore6Mos'], crime1['nonViolPerPop'])[0])
Correlation coefficient for PctHousOccup:  -0.3039032395515144
Correlation coefficient for PctHousOwnOcc:  -0.4622358628933084
Correlation coefficient for PctVacantBoarded:  0.32367867144782136
Correlation coefficient for PctVacMore6Mos:  -0.04302596621892053

Plot linear regression plots for population vacancy and non-violent crimes

In [82]:
crime['PopHousOccup'] = crime['PctHousOccup'] * crime['population']
crime['PopHousOwnOcc'] = crime['PctHousOwnOcc'] * crime['population']
crime['PopVacantBoarded'] = crime['PctVacantBoarded'] * crime['population']
crime['PopVacMore6Mos'] = crime['PctVacMore6Mos'] * crime['population']
In [83]:
sns.lmplot(x='PopHousOccup', y='nonViolPerPop', data=crime)
sns.lmplot(x='PopHousOwnOcc', y='nonViolPerPop', data=crime)
sns.lmplot(x='PopVacantBoarded', y='nonViolPerPop', data=crime)
sns.lmplot(x='PopVacMore6Mos', y='nonViolPerPop', data=crime)
Out[83]:
<seaborn.axisgrid.FacetGrid at 0x25c586bf040>
In [85]:
# calculate correlation coefficient for population vacancy and non-violent crimes
print('Correlation coefficient for PopHousOccup: ', scipy.stats.pearsonr(crime1['PopHousOccup'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PopHousOwnOcc: ', scipy.stats.pearsonr(crime1['PopHousOwnOcc'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PopVacantBoarded: ', scipy.stats.pearsonr(crime1['PopVacantBoarded'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for PopVacMore6Mos: ', scipy.stats.pearsonr(crime1['PopVacMore6Mos'], crime1['nonViolPerPop'])[0])
Correlation coefficient for PopHousOccup:  0.1133129368532727
Correlation coefficient for PopHousOwnOcc:  0.13714579037718938
Correlation coefficient for PopVacantBoarded:  0.1531535526401337
Correlation coefficient for PopVacMore6Mos:  0.1082544467613051

Plot linear regression plots for %vacancy and non-violent crimes

In [86]:
sns.lmplot(x='PctHousOccup', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PctHousOwnOcc', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PctVacantBoarded', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PctVacMore6Mos', y='ViolentCrimesPerPop', data=crime)
Out[86]:
<seaborn.axisgrid.FacetGrid at 0x25c5c58d2b0>
In [88]:
# calculate correlation coefficient between %vacancy and violent crimes
print('Correlation coefficient for PctHousOccup: ', scipy.stats.pearsonr(crime1['PctHousOccup'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PctHousOwnOcc: ', scipy.stats.pearsonr(crime1['PctHousOwnOcc'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PctVacantBoarded: ', scipy.stats.pearsonr(crime1['PctVacantBoarded'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PctVacMore6Mos: ', scipy.stats.pearsonr(crime1['PctVacMore6Mos'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for PctHousOccup:  -0.25554595819128334
Correlation coefficient for PctHousOwnOcc:  -0.46069357769159813
Correlation coefficient for PctVacantBoarded:  0.47510410552705856
Correlation coefficient for PctVacMore6Mos:  0.017526764073398652

Plot linear regression plots for population vacancy and non-violent crimes

In [89]:
sns.lmplot(x='PopHousOccup', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PopHousOwnOcc', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PopVacantBoarded', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='PopVacMore6Mos', y='ViolentCrimesPerPop', data=crime)
Out[89]:
<seaborn.axisgrid.FacetGrid at 0x25c5895f940>
In [91]:
# calculate correlatiion coefficient between population vacancy and violent crimes
print('Correlation coefficient for PopHousOccup: ', scipy.stats.pearsonr(crime1['PopHousOccup'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PopHousOwnOcc: ', scipy.stats.pearsonr(crime1['PopHousOwnOcc'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PopVacantBoarded: ', scipy.stats.pearsonr(crime1['PopVacantBoarded'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for PopVacMore6Mos: ', scipy.stats.pearsonr(crime1['PopVacMore6Mos'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for PopHousOccup:  0.2062343596261637
Correlation coefficient for PopHousOwnOcc:  0.23184561959918992
Correlation coefficient for PopVacantBoarded:  0.2698337506115343
Correlation coefficient for PopVacMore6Mos:  0.19955923301184988

Plot linear regression plot for % race and non-violent crimes

In [92]:
sns.lmplot(x='racepctblack', y='nonViolPerPop', data=crime)
sns.lmplot(x='racePctWhite', y='nonViolPerPop', data=crime)
sns.lmplot(x='racePctAsian', y='nonViolPerPop', data=crime)
sns.lmplot(x='racePctHisp', y='nonViolPerPop', data=crime)
Out[92]:
<seaborn.axisgrid.FacetGrid at 0x25c58a5b5e0>
In [63]:
# calculate correlation coefficient between %race and non-violent crimes
crime1 = crime.dropna(subset=['agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 
                              'agePop12t21', 'agePop12t29', 'agePop16t24', 'agePop65up',
                              'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
                              'PopLess9thGrade', 'PopNotHSGrad', 'PopBSorMore',
                              'PctUnemployed', 'PctEmploy',
                              'PopUnemployed', 'PopEmploy',
                              'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
                              'PopHousOccup', 'PopHousOwnOcc', 'PopVacantBoarded', 'PopVacMore6Mos',
                              'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp',
                              'racepopblack', 'racePopWhite', 'racePopAsian', 'racePopHisp',
                              'nonViolPerPop', 'ViolentCrimesPerPop'], how='any')
print('Correlation coefficient for pctraceblack: ', scipy.stats.pearsonr(crime1['racepctblack'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for pctRaceWhite: ', scipy.stats.pearsonr(crime1['racePctWhite'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for pctRaceAsian: ', scipy.stats.pearsonr(crime1['racePctAsian'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for pctRaceHisp: ', scipy.stats.pearsonr(crime1['racePctHisp'], crime1['nonViolPerPop'])[0])
Correlation coefficient for pctraceblack:  0.4743247060336021
Correlation coefficient for pctRaceWhite:  -0.4765791610681369
Correlation coefficient for pctRaceAsian:  -0.03474179713723831
Correlation coefficient for pctRaceHisp:  0.17462237036514378

Plot linear regression plots for population race and non-violent crimes

In [64]:
crime['racepopblack'] = crime['racepctblack'] * crime['population']
crime['racePopWhite'] = crime['racePctWhite'] * crime['population']
crime['racePopAsian'] = crime['racePctAsian'] * crime['population']
crime['racePopHisp'] = crime['racePctHisp'] * crime['population']
In [65]:
sns.lmplot(x='racepopblack', y='nonViolPerPop', data=crime)
sns.lmplot(x='racePopWhite', y='nonViolPerPop', data=crime)
sns.lmplot(x='racePopAsian', y='nonViolPerPop', data=crime)
sns.lmplot(x='racePopHisp', y='nonViolPerPop', data=crime)
Out[65]:
<seaborn.axisgrid.FacetGrid at 0x25c5c392550>
In [71]:
# calculate correlation coefficient for population race and non-violent crimes
print('Correlation coefficient for popraceblack: ', scipy.stats.pearsonr(crime1['racepopblack'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for popRaceWhite: ', scipy.stats.pearsonr(crime1['racePopWhite'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for popRaceAsian: ', scipy.stats.pearsonr(crime1['racePopAsian'], crime1['nonViolPerPop'])[0])
print('Correlation coefficient for popRaceHisp: ', scipy.stats.pearsonr(crime1['racePopHisp'], crime1['nonViolPerPop'])[0])
Correlation coefficient for popraceblack:  0.1381219744795777
Correlation coefficient for popRaceWhite:  0.12002428600448702
Correlation coefficient for popRaceAsian:  0.05419081407159138
Correlation coefficient for popRaceHisp:  0.08281266109978014

Plot linear regression plot for %race and violent crimes

In [72]:
sns.lmplot(x='racepctblack', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='racePctWhite', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='racePctAsian', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='racePctHisp', y='ViolentCrimesPerPop', data=crime)
Out[72]:
<seaborn.axisgrid.FacetGrid at 0x25c5b42b9d0>
In [74]:
# calculate correlation coefficient for %race and violent crimes
print('Correlation coefficient for pctraceblack: ', scipy.stats.pearsonr(crime1['racepctblack'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for pctRaceWhite: ', scipy.stats.pearsonr(crime1['racePctWhite'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for pctRaceAsian: ', scipy.stats.pearsonr(crime1['racePctAsian'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for pctRaceHisp: ', scipy.stats.pearsonr(crime1['racePctHisp'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for pctraceblack:  0.6238334896507505
Correlation coefficient for pctRaceWhite:  -0.676357463352348
Correlation coefficient for pctRaceAsian:  0.03604447688047008
Correlation coefficient for pctRaceHisp:  0.26451715732322045

Plot linear regression plot for population race and violent crimes

In [75]:
sns.lmplot(x='racepopblack', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='racePopWhite', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='racePopAsian', y='ViolentCrimesPerPop', data=crime)
sns.lmplot(x='racePopHisp', y='ViolentCrimesPerPop', data=crime)
Out[75]:
<seaborn.axisgrid.FacetGrid at 0x25c5bd36490>
In [76]:
# calculate correlation coefficient for population race and violent crimes
print('Correlation coefficient for popraceblack: ', scipy.stats.pearsonr(crime1['racepopblack'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for popRaceWhite: ', scipy.stats.pearsonr(crime1['racePopWhite'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for popRaceAsian: ', scipy.stats.pearsonr(crime1['racePopAsian'], crime1['ViolentCrimesPerPop'])[0])
print('Correlation coefficient for popRaceHisp: ', scipy.stats.pearsonr(crime1['racePopHisp'], crime1['ViolentCrimesPerPop'])[0])
Correlation coefficient for popraceblack:  nan
Correlation coefficient for popRaceWhite:  0.19103526091282771
Correlation coefficient for popRaceAsian:  0.14443796853542856
Correlation coefficient for popRaceHisp:  0.16900747491190238

Plot linear regression plots for violent crimes vs. non-violent crimes by region

In [77]:
sns.lmplot(x='ViolentCrimesPerPop', y='nonViolPerPop', data=crime,
           fit_reg=True, #  regression line
           hue='Region',x_jitter=.1, y_jitter=0.1)   # Color by Region
Out[77]:
<seaborn.axisgrid.FacetGrid at 0x25c5b8d7040>

Correlation matrix and Multiple Linear Regression model

Plot heatmap

In [93]:
crime.corr()
Out[93]:
Unnamed: 0 countyCode communityCode fold population householdsize racepctblack racePctWhite racePctAsian racePctHisp ... PopEmploy PopUnemployed PopHousOccup PopHousOwnOcc PopVacantBoarded PopVacMore6Mos racepopblack racePopWhite racePopAsian racePopHisp
Unnamed: 0 1.000000 0.330027 0.098597 0.026590 0.002961 -0.085890 0.168589 -0.093145 -0.118527 -0.098382 ... 0.003722 0.000673 0.001322 0.015846 0.007857 -0.003181 0.019008 0.006031 -0.033660 -0.031724
countyCode 0.330027 1.000000 0.124997 -0.060262 0.080867 -0.032992 0.219294 -0.173580 -0.085545 -0.088787 ... 0.087084 0.049108 0.082187 0.093037 0.046351 0.062575 0.091435 0.079932 0.028103 -0.013803
communityCode 0.098597 0.124997 1.000000 0.004526 -0.034680 0.002698 -0.013897 0.014749 0.033570 0.000687 ... -0.030735 -0.049584 -0.034231 -0.019389 -0.040014 -0.037967 -0.050314 -0.018727 -0.005206 -0.026689
fold 0.026590 -0.060262 0.004526 1.000000 -0.044338 0.015973 -0.040064 0.022973 0.004439 0.035620 ... -0.043224 -0.045374 -0.044404 -0.039704 -0.048556 -0.048266 -0.047387 -0.039264 -0.042699 -0.036323
population 0.002961 0.080867 -0.034680 -0.044338 1.000000 -0.018841 0.135641 -0.184685 0.088360 0.094048 ... 0.997420 0.980257 0.999602 0.960874 0.817184 0.969573 0.920310 0.985310 0.881461 0.915035
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
PopVacMore6Mos -0.003181 0.062575 -0.037967 -0.048266 0.969573 -0.028066 0.162980 -0.180756 0.037435 0.062449 ... 0.954669 0.974549 0.967702 0.922105 0.878518 1.000000 0.961107 0.941060 0.794884 0.836116
racepopblack 0.019008 0.091435 -0.050314 -0.047387 0.920310 -0.025811 0.261235 -0.252400 0.023510 0.036930 ... 0.899555 0.956437 0.917470 0.873860 0.906417 0.961107 1.000000 0.856934 0.722720 0.754417
racePopWhite 0.006031 0.079932 -0.018727 -0.039264 0.985310 -0.037977 0.086765 -0.137822 0.087790 0.089963 ... 0.989585 0.941981 0.984765 0.963334 0.770024 0.941060 0.856934 1.000000 0.865640 0.901571
racePopAsian -0.033660 0.028103 -0.005206 -0.042699 0.881461 0.022648 0.043137 -0.162957 0.277567 0.108268 ... 0.888019 0.833330 0.888139 0.809605 0.564153 0.794884 0.722720 0.865640 1.000000 0.879739
racePopHisp -0.031724 -0.013803 -0.026689 -0.036323 0.915035 0.062769 0.049429 -0.157381 0.095645 0.216809 ... 0.918027 0.890480 0.917507 0.849623 0.626975 0.836116 0.754417 0.901571 0.879739 1.000000

163 rows × 163 columns

A lot of variables are present. Will tidy this up by subsetting columns into a new dataframe

In [94]:
# will leave out age group from this subset, since they did not have strong correlation
crimedata = crime[['PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
                  'PctUnemployed', 'PctEmploy',
                   'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
                   'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp',
                   'nonViolPerPop', 'ViolentCrimesPerPop']]
In [95]:
crimedata.corr()
Out[95]:
PctLess9thGrade PctNotHSGrad PctBSorMore PctUnemployed PctEmploy PctHousOccup PctHousOwnOcc PctVacantBoarded PctVacMore6Mos racepctblack racePctWhite racePctAsian racePctHisp nonViolPerPop ViolentCrimesPerPop
PctLess9thGrade 1.000000 0.927560 -0.577203 0.657108 -0.531317 -0.144882 -0.358801 0.322437 0.209439 0.244873 -0.458497 -0.109690 0.635955 0.303679 0.371422
PctNotHSGrad 0.927560 1.000000 -0.751544 0.724004 -0.617251 -0.207866 -0.378211 0.416527 0.283349 0.367293 -0.494350 -0.182692 0.493895 0.387613 0.467596
PctBSorMore -0.577203 -0.751544 1.000000 -0.545808 0.393518 0.179709 0.190965 -0.296578 -0.220493 -0.188492 0.217388 0.262881 -0.245779 -0.282542 -0.299898
PctUnemployed 0.657108 0.724004 -0.545808 1.000000 -0.676430 -0.261544 -0.394427 0.549636 0.299310 0.441598 -0.539588 -0.133305 0.416788 0.408442 0.483441
PctEmploy -0.531317 -0.617251 0.393518 -0.676430 1.000000 0.341939 0.236908 -0.342344 -0.372459 -0.298779 0.282903 0.195498 -0.161893 -0.329996 -0.317644
PctHousOccup -0.144882 -0.207866 0.179709 -0.261544 0.341939 1.000000 0.171256 -0.182542 -0.274189 -0.204498 0.153423 0.177288 -0.073658 -0.309280 -0.256836
PctHousOwnOcc -0.358801 -0.378211 0.190965 -0.394427 0.236908 0.171256 1.000000 -0.221876 0.138628 -0.345849 0.449833 -0.078754 -0.251056 -0.466257 -0.455359
PctVacantBoarded 0.322437 0.416527 -0.296578 0.549636 -0.342344 -0.182542 -0.221876 1.000000 0.366664 0.521610 -0.487907 -0.113272 0.151015 0.343413 0.479910
PctVacMore6Mos 0.209439 0.283349 -0.220493 0.299310 -0.372459 -0.274189 0.138628 0.366664 1.000000 0.190708 -0.033292 -0.323736 -0.122969 -0.017010 0.030769
racepctblack 0.244873 0.367293 -0.188492 0.441598 -0.298779 -0.204498 -0.345849 0.521610 0.190708 1.000000 -0.820605 -0.089300 -0.063911 0.484853 0.628368
racePctWhite -0.458497 -0.494350 0.217388 -0.539588 0.282903 0.153423 0.449833 -0.487907 -0.033292 -0.820605 1.000000 -0.276474 -0.408489 -0.487033 -0.676849
racePctAsian -0.109690 -0.182692 0.262881 -0.133305 0.195498 0.177288 -0.078754 -0.113272 -0.323736 -0.089300 -0.276474 1.000000 0.198439 -0.037223 0.031949
racePctHisp 0.635955 0.493895 -0.245779 0.416788 -0.161893 -0.073658 -0.251056 0.151015 -0.122969 -0.063911 -0.408489 0.198439 1.000000 0.174438 0.253596
nonViolPerPop 0.303679 0.387613 -0.282542 0.408442 -0.329996 -0.309280 -0.466257 0.343413 -0.017010 0.484853 -0.487033 -0.037223 0.174438 1.000000 0.675374
ViolentCrimesPerPop 0.371422 0.467596 -0.299898 0.483441 -0.317644 -0.256836 -0.455359 0.479910 0.030769 0.628368 -0.676849 0.031949 0.253596 0.675374 1.000000

Looks much better

In [96]:
# load dataframe into a variable
crimedata_corr = crimedata.corr()
In [98]:
# make the correlation matrix plot
plt.figure(figsize=(12,10))
sns.heatmap(crimedata_corr,annot=True,vmin=-1.0)

# save image
plt.savefig('CrimeHeatmap.png')

This is so much concise and takes less time to generate. Correlation coefficients can be easily viewed for any pair of variables

In [99]:
sns.pairplot(crimedata)
Out[99]:
<seaborn.axisgrid.PairGrid at 0x25c5c482f40>

Multiple Linear Regression

Dependent Variable: ViolentCrimesPerPop

In [100]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model, preprocessing

# Subset data
X = crime1[['PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
                  'PctUnemployed', 'PctEmploy',
                   'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
                   'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp',
                   'nonViolPerPop']]
y = crime1['ViolentCrimesPerPop']

# train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 123)
In [101]:
# check size
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(1426, 14) (1426,)
(476, 14) (476,)
In [103]:
# fit a model
lm = linear_model.LinearRegression(normalize=True)

# train model on test data
model = lm.fit(X_train, y_train)

# Evaluate model
print(model.score(X_train, y_train))

# Use model to make predictions
y_pred = lm.predict(X_test)

# Calculate coeffcient and intercept
coefficients = model.coef_
intercepts = model.intercept_
0.6672082286337916
In [104]:
plt.scatter(y_test, y_pred)
Out[104]:
<matplotlib.collections.PathCollection at 0x25c66efb160>
In [105]:
# Use statsmodel
X = sm.add_constant(X)
model2 = sm.OLS(y_train,X_train).fit()
print (model2.summary())
                                 OLS Regression Results                                 
========================================================================================
Dep. Variable:     ViolentCrimesPerPop   R-squared (uncentered):                   0.825
Model:                             OLS   Adj. R-squared (uncentered):              0.823
Method:                  Least Squares   F-statistic:                              475.1
Date:                 Sat, 28 Nov 2020   Prob (F-statistic):                        0.00
Time:                         00:11:55   Log-Likelihood:                         -10361.
No. Observations:                 1426   AIC:                                  2.075e+04
Df Residuals:                     1412   BIC:                                  2.082e+04
Df Model:                           14                                                  
Covariance Type:             nonrobust                                                  
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
PctLess9thGrade    -44.6364      5.250     -8.502      0.000     -54.936     -34.337
PctNotHSGrad        36.2436      3.897      9.299      0.000      28.598      43.889
PctBSorMore          6.3628      1.416      4.493      0.000       3.585       9.141
PctUnemployed        2.6276      6.289      0.418      0.676      -9.710      14.965
PctEmploy            4.8657      1.673      2.908      0.004       1.583       8.148
PctHousOccup        -3.5006      1.919     -1.824      0.068      -7.265       0.264
PctHousOwnOcc       -1.8796      0.831     -2.261      0.024      -3.510      -0.249
PctVacantBoarded    22.9530      3.607      6.363      0.000      15.877      30.029
PctVacMore6Mos      -1.6623      0.833     -1.994      0.046      -3.297      -0.027
racepctblack         9.1377      2.082      4.389      0.000       5.053      13.222
racePctWhite        -4.6117      1.799     -2.563      0.010      -8.141      -1.083
racePctAsian         5.3465      3.036      1.761      0.078      -0.609      11.302
racePctHisp          5.7942      1.321      4.385      0.000       3.202       8.386
nonViolPerPop        0.0776      0.004     18.508      0.000       0.069       0.086
==============================================================================
Omnibus:                      400.287   Durbin-Watson:                   2.035
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2305.995
Skew:                           1.179   Prob(JB):                         0.00
Kurtosis:                       8.766   Cond. No.                     4.08e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.08e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [106]:
# show values
print("The coeffcient of our model is: ", coefficients[0])
print("The intercept for our model is: ", intercepts)
print ("Linear model Train dataset score is: ", model.score(X_train,y_train))
print ("Linear model Test dataset score is: ", model.score(X_test,y_test))
The coeffcient of our model is:  -38.65614796641967
The intercept for our model is:  1525.2737559291973
Linear model Train dataset score is:  0.6672082286337916
Linear model Test dataset score is:  0.6479427578855034

Compute cross-validation score

In [107]:
from sklearn.model_selection import cross_val_score

cv_results = cross_val_score(lm, X, y, cv=10)
print(cv_results)
print('Accuracy of model: ', np.mean(cv_results))
[0.82307649 0.66620002 0.06807479 0.22156865 0.56695627 0.70258549
 0.72725476 0.52135816 0.67833606 0.62544576]
Accuracy of model:  0.5600856452100784

Ridge regression

In [108]:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=0.1, normalize=True)
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
ridge.score(X_test, y_test)
Out[108]:
0.6438914111720577

Lasso regression

In [109]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.1, normalize=True)
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
lasso.score(X_test, y_test)
Out[109]:
0.6461414763864808

Dependent Variable: NonViolCrime

In [110]:
# Subset data
X = crime1[['PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
                  'PctUnemployed', 'PctEmploy',
                   'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
                   'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp',
                   'ViolentCrimesPerPop']]
y = crime1['nonViolPerPop']

# train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 123)
In [111]:
# check size
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(1426, 14) (1426,)
(476, 14) (476,)
In [112]:
# fit a model
lm = linear_model.LinearRegression(normalize=True)

# train model on test data
model = lm.fit(X_train, y_train)

# Evaluate model
print(model.score(X_train, y_train))

# Use model to make predictions
y_pred = lm.predict(X_test)

# Calculate coeffcient and intercept
coefficients = model.coef_
intercepts = model.intercept_
0.5149764194147157
In [113]:
plt.scatter(y_test, y_pred)
Out[113]:
<matplotlib.collections.PathCollection at 0x25c68acceb0>
In [114]:
# Use statsmodel
X = sm.add_constant(X)
model2 = sm.OLS(y_train,X_train).fit()
print(model2.summary())
                                 OLS Regression Results                                
=======================================================================================
Dep. Variable:          nonViolPerPop   R-squared (uncentered):                   0.879
Model:                            OLS   Adj. R-squared (uncentered):              0.877
Method:                 Least Squares   F-statistic:                              729.6
Date:                Sat, 28 Nov 2020   Prob (F-statistic):                        0.00
Time:                        00:12:04   Log-Likelihood:                         -12842.
No. Observations:                1426   AIC:                                  2.571e+04
Df Residuals:                    1412   BIC:                                  2.579e+04
Df Model:                          14                                                  
Covariance Type:            nonrobust                                                  
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
PctLess9thGrade       -16.9689     30.665     -0.553      0.580     -77.123      43.185
PctNotHSGrad            6.8982     22.874      0.302      0.763     -37.972      51.768
PctBSorMore           -10.3915      8.121     -1.280      0.201     -26.322       5.539
PctUnemployed          78.0013     35.774      2.180      0.029       7.826     148.176
PctEmploy             -13.7820      9.554     -1.443      0.149     -32.524       4.960
PctHousOccup          -28.4605     10.920     -2.606      0.009     -49.882      -7.039
PctHousOwnOcc         -28.6547      4.683     -6.119      0.000     -37.840     -19.469
PctVacantBoarded        0.6076     20.845      0.029      0.977     -40.282      41.497
PctVacMore6Mos        -18.1311      4.731     -3.833      0.000     -27.411      -8.851
racepctblack          102.3185     11.628      8.799      0.000      79.508     125.129
racePctWhite           92.7819      9.972      9.304      0.000      73.221     112.343
racePctAsian           70.7048     17.214      4.107      0.000      36.938     104.472
racePctHisp            31.1728      7.534      4.138      0.000      16.395      45.951
ViolentCrimesPerPop     2.5173      0.136     18.508      0.000       2.250       2.784
==============================================================================
Omnibus:                      802.609   Durbin-Watson:                   2.028
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            22170.196
Skew:                           2.082   Prob(JB):                         0.00
Kurtosis:                      21.862   Cond. No.                         610.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Compute cross-validation score

In [115]:
cv_results = cross_val_score(lm, X, y, cv=10)
print(cv_results)
print('Accuracy of model: ', np.mean(cv_results))
[ 0.54706795  0.27731896 -0.00212999  0.3013098   0.02728441  0.6336228
  0.67041345  0.28479772  0.47252852  0.59192529]
Accuracy of model:  0.38041389117960617

Ridge regression

In [116]:
ridge = Ridge(alpha=0.1, normalize=True)
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
ridge.score(X_test, y_test)
Out[116]:
0.5367876274904562

Lasso regression

In [117]:
lasso = Lasso(alpha=0.1, normalize=True)
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
lasso.score(X_test, y_test)
Out[117]:
0.5419572432530835
In [ ]: